% Journal article (2015): syntactic and semantic features for code-switching factored language models.
@article{adel2015syntactic,
  author    = {Adel, Heike and Vu, Ngoc Thang and Kirchhoff, Katrin and Telaar, Dominic and Schultz, Tanja},
  title     = {Syntactic and semantic features for code-switching factored language models},
  journal   = {{IEEE/ACM} Transactions on Audio, Speech, and Language Processing},
  volume    = {23},
  number    = {3},
  pages     = {431--440},
  year      = {2015},
  publisher = {IEEE},
}
% arXiv preprint (2016): a neural knowledge language model.
@article{ahn2016neural,
  author  = {Ahn, Sungjin and Choi, Heeyoul and P{\"a}rnamaa, Tanel and Bengio, Yoshua},
  title   = {A neural knowledge language model},
  journal = {arXiv preprint arXiv:1608.00318},
  year    = {2016},
}
% Conference short paper (2006): factored neural language models.
@inproceedings{alexandrescu2006factored,
  author    = {Alexandrescu, Andrei and Kirchhoff, Katrin},
  title     = {Factored neural language models},
  booktitle = {Proceedings of the Human Language Technology Conference of the NAACL, Companion Volume: Short Papers},
  pages     = {1--4},
  year      = {2006},
}
% arXiv preprint (2014): neural machine translation with jointly learned alignment.
@article{bahdanau2014neural,
  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {Neural machine translation by jointly learning to align and translate},
  journal = {arXiv preprint arXiv:1409.0473},
  year    = {2014},
}
% AISTATS paper (2003): importance sampling for quick training of probabilistic neural nets.
% The title carries no trailing period; the bibliography style supplies punctuation.
% NOTE(review): "and others" may be an export artifact -- confirm the full author list.
@inproceedings{bengio2003quick,
  author    = {Bengio, Yoshua and Sen{\'e}cal, Jean-S{\'e}bastien and others},
  title     = {Quick Training of Probabilistic Neural Nets by Importance Sampling},
  booktitle = {AISTATS},
  pages     = {1--9},
  year      = {2003},
}
% Journal article (2008): adaptive importance sampling for neural probabilistic language model training.
@article{bengio2008adaptive,
  author    = {Bengio, Yoshua and Sen{\'e}cal, Jean-S{\'e}bastien},
  title     = {Adaptive importance sampling to accelerate training of a neural probabilistic language model},
  journal   = {IEEE Transactions on Neural Networks},
  volume    = {19},
  number    = {4},
  pages     = {713--722},
  year      = {2008},
  publisher = {IEEE},
}
% Journal article (2003): a neural probabilistic language model.
% The issue is recorded as the month label "Feb", exactly as exported.
@article{bengio2003neural,
  author  = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
  title   = {A neural probabilistic language model},
  journal = {Journal of Machine Learning Research},
  volume  = {3},
  number  = {Feb},
  pages   = {1137--1155},
  year    = {2003},
}
% arXiv preprint (2018): BERT, pre-training of deep bidirectional transformers.
% The acronym is brace-protected so sentence-casing styles do not lowercase it.
@article{devlin2018bert,
  author  = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title   = {{BERT}: Pre-training of deep bidirectional transformers for language understanding},
  journal = {arXiv preprint arXiv:1810.04805},
  year    = {2018},
}
% arXiv preprint (2016): the TheanoLM toolkit for neural network language modeling.
% The toolkit name is brace-protected to keep its mixed case; "---" typesets the dash.
@article{enarvi2016theanolm,
  author  = {Enarvi, Seppo and Kurimo, Mikko},
  title   = {{TheanoLM}---an extensible toolkit for neural network language modeling},
  journal = {arXiv preprint arXiv:1605.00942},
  year    = {2016},
}
% arXiv preprint (2016): neural language models with a continuous cache.
@article{grave2016improving,
  author  = {Grave, Edouard and Joulin, Armand and Usunier, Nicolas},
  title   = {Improving neural language models with a continuous cache},
  journal = {arXiv preprint arXiv:1612.04426},
  year    = {2016},
}
% IEEE workshop paper (2013): hybrid speech recognition with deep bidirectional LSTM.
% Acronyms are brace-protected so sentence-casing styles do not lowercase them.
@inproceedings{graves2013hybrid,
  author       = {Graves, Alex and Jaitly, Navdeep and Mohamed, Abdel-rahman},
  title        = {Hybrid speech recognition with deep bidirectional {LSTM}},
  booktitle    = {2013 {IEEE} Workshop on Automatic Speech Recognition and Understanding},
  pages        = {273--278},
  year         = {2013},
  organization = {IEEE},
}
% ICASSP paper (2014): cache-based RNN language model inference for first-pass speech recognition.
@inproceedings{huang2014cache,
  author       = {Huang, Zhiheng and Zweig, Geoffrey and Dumoulin, Benoit},
  title        = {Cache based recurrent neural network language model inference for first pass speech recognition},
  booktitle    = {2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {6354--6358},
  year         = {2014},
  organization = {IEEE},
}
% ICASSP paper (2017): character-level language modeling with hierarchical recurrent neural networks.
@inproceedings{hwang2017character,
  author       = {Hwang, Kyuyeon and Sung, Wonyong},
  title        = {Character-level language modeling with hierarchical recurrent neural networks},
  booktitle    = {2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  pages        = {5720--5724},
  year         = {2017},
  organization = {IEEE},
}
% Journal abstract (1977): perplexity as a measure of speech recognition task difficulty.
% The dash is written as "---" to keep the file ASCII-only; the item occupies the single page S63.
@article{jelinek1977perplexity,
  author    = {Jelinek, Fred and Mercer, Robert L and Bahl, Lalit R and Baker, James K},
  title     = {Perplexity---a measure of the difficulty of speech recognition tasks},
  journal   = {The Journal of the Acoustical Society of America},
  volume    = {62},
  number    = {S1},
  pages     = {S63},
  year      = {1977},
  publisher = {Acoustical Society of America},
}
% AAAI conference paper (2016): character-aware neural language models.
@inproceedings{kim2016character,
  author    = {Kim, Yoon and Jernite, Yacine and Sontag, David and Rush, Alexander M},
  title     = {Character-aware neural language models},
  booktitle = {Thirtieth AAAI Conference on Artificial Intelligence},
  year      = {2016},
}
% Journal article (2012): structured output layer neural network language models for speech recognition.
@article{le2012structured,
  author    = {Le, Hai-Son and Oparin, Ilya and Allauzen, Alexandre and Gauvain, Jean-Luc and Yvon, Fran{\c{c}}ois},
  title     = {Structured output layer neural network language models for speech recognition},
  journal   = {IEEE Transactions on Audio, Speech, and Language Processing},
  volume    = {21},
  number    = {1},
  pages     = {197--206},
  year      = {2012},
  publisher = {IEEE},
}
% AAAI conference paper (2017): coherent dialogue with attention-based language models.
@inproceedings{mei2017coherent,
  author    = {Mei, Hongyuan and Bansal, Mohit and Walter, Matthew R},
  title     = {Coherent dialogue with attention-based language models},
  booktitle = {Thirty-First AAAI Conference on Artificial Intelligence},
  year      = {2017},
}
% Conference paper (2010): recurrent neural network based language model.
% The final letter of the fourth author's surname takes an acute accent.
@inproceedings{mikolov2010recurrent,
  author    = {Mikolov, Tom{\'a}{\v{s}} and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\'y}, Jan and Khudanpur, Sanjeev},
  title     = {Recurrent neural network based language model},
  booktitle = {Eleventh Annual Conference of the International Speech Communication Association},
  year      = {2010},
}
% ICASSP paper (2011): extensions of the recurrent neural network language model.
% The final letter of the fourth author's surname takes an acute accent.
@inproceedings{mikolov2011extensions,
  author       = {Mikolov, Tom{\'a}{\v{s}} and Kombrink, Stefan and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\'y}, Jan and Khudanpur, Sanjeev},
  title        = {Extensions of recurrent neural network language model},
  booktitle    = {2011 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages        = {5528--5531},
  year         = {2011},
  organization = {IEEE},
}
% ASRU workshop paper (2011): the RNNLM recurrent neural network language modeling toolkit.
% Author names use the same accented spellings as the other entries by these authors in this file.
@inproceedings{mikolov2011rnnlm,
  author    = {Mikolov, Tom{\'a}{\v{s}} and Kombrink, Stefan and Deoras, Anoop and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\'y}, Jan},
  title     = {{RNNLM}---recurrent neural network language modeling toolkit},
  booktitle = {Proc. of the 2011 ASRU Workshop},
  pages     = {196--201},
  year      = {2011},
}
% Preprint (2012): subword language modeling with neural networks.
% NOTE(review): the volume value and the URL path (possibly missing a "~" before the user name)
% look like export artifacts -- confirm against the actual document location.
@article{mikolov2012subword,
  author  = {Mikolov, Tom{\'a}{\v{s}} and Sutskever, Ilya and Deoras, Anoop and Le, Hai-Son and Kombrink, Stefan and {\v{C}}ernock{\'y}, Jan},
  title   = {Subword language modeling with neural networks},
  journal = {preprint (http://www.fit.vutbr.cz/imikolov/rnnlm/char.pdf)},
  volume  = {8},
  year    = {2012},
}
% arXiv preprint (2013): efficient estimation of word representations in vector space.
@article{mikolov2013efficient,
  author  = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  title   = {Efficient estimation of word representations in vector space},
  journal = {arXiv preprint arXiv:1301.3781},
  year    = {2013},
}
% arXiv preprint (2016): gated word-character recurrent language model.
@article{miyamoto2016gated,
  author  = {Miyamoto, Yasumasa and Cho, Kyunghyun},
  title   = {Gated word-character recurrent language model},
  journal = {arXiv preprint arXiv:1606.01700},
  year    = {2016},
}
% Conference paper (2009): a scalable hierarchical distributed language model.
@inproceedings{mnih2009scalable,
  author    = {Mnih, Andriy and Hinton, Geoffrey E},
  title     = {A scalable hierarchical distributed language model},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {1081--1088},
  year      = {2009},
}
% AISTATS paper (2005): hierarchical probabilistic neural network language model.
% The title carries no trailing period; the bibliography style supplies punctuation.
% NOTE(review): an "organization" value naming an indexing site was dropped as an export artifact -- confirm.
@inproceedings{morin2005hierarchical,
  author    = {Morin, Frederic and Bengio, Yoshua},
  title     = {Hierarchical probabilistic neural network language model},
  booktitle = {AISTATS},
  volume    = {5},
  pages     = {246--252},
  year      = {2005},
}
% arXiv preprint (2018): deep contextualized word representations.
@article{peters2018deep,
  author  = {Peters, Matthew E and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke},
  title   = {Deep contextualized word representations},
  journal = {arXiv preprint arXiv:1802.05365},
  year    = {2018},
}
% Report (2018): improving language understanding by generative pre-training.
% The document location lives in the url field rather than in a journal name.
% NOTE(review): the URL was reconstructed from a garbled export (spaces removed, hyphens and
% underscores restored) and the institution is inferred from the host path -- verify both.
@techreport{radford2018improving,
  author      = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
  title       = {Improving language understanding by generative pre-training},
  institution = {OpenAI},
  year        = {2018},
  url         = {https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf},
}
% INTERSPEECH paper (2013): the CSLM continuous space language modeling toolkit.
% The acronym is brace-protected, "---" typesets the dash, and the title has no trailing period.
@inproceedings{schwenk2013cslm,
  author    = {Schwenk, Holger},
  title     = {{CSLM}---a modular open-source continuous space language modeling toolkit},
  booktitle = {INTERSPEECH},
  pages     = {1198--1202},
  year      = {2013},
}
% Conference paper (2012): impact of word classing on recurrent neural network language models.
@inproceedings{si2012impact,
  author       = {Si, Yujing and Guo, Yuhong and Liu, Yong and Pan, Jielin and Yan, Yonghong},
  title        = {Impact of Word Classing on Recurrent Neural Network Language Model},
  booktitle    = {2012 Third Global Congress on Intelligent Systems},
  pages        = {100--103},
  year         = {2012},
  organization = {IEEE},
}
% Conference paper (2012): neural network language model with cache.
% Springer is recorded as the publisher of the proceedings.
@inproceedings{soutner2012neural,
  author    = {Soutner, Daniel and Loose, Zden{\v{e}}k and M{\"u}ller, Lud{\v{e}}k and Pra{\v{z}}{\'a}k, Ale{\v{s}}},
  title     = {Neural network language model with cache},
  booktitle = {International Conference on Text, Speech and Dialogue},
  pages     = {528--534},
  year      = {2012},
  publisher = {Springer},
}
% Conference paper (2012): LSTM neural networks for language modeling.
% The acronym is brace-protected so sentence-casing styles do not lowercase it.
@inproceedings{sundermeyer2012lstm,
  author    = {Sundermeyer, Martin and Schl{\"u}ter, Ralf and Ney, Hermann},
  title     = {{LSTM} neural networks for language modeling},
  booktitle = {Thirteenth Annual Conference of the International Speech Communication Association},
  year      = {2012},
}
% arXiv preprint (2016): recurrent memory networks for language modeling.
@article{tran2016recurrent,
  author  = {Tran, Ke and Bisazza, Arianna and Monz, Christof},
  title   = {Recurrent memory networks for language modeling},
  journal = {arXiv preprint arXiv:1601.01272},
  year    = {2016},
}
% Conference paper (2017): attention is all you need.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {5998--6008},
  year      = {2017},
}
% arXiv preprint (2017): character-word LSTM language models.
% The acronym is brace-protected so sentence-casing styles do not lowercase it.
% NOTE(review): "and others" may be an export artifact -- confirm the full author list.
@article{verwimp2017character,
  author  = {Verwimp, Lyan and Pelemans, Joris and Wambacq, Patrick and others},
  title   = {Character-word {LSTM} language models},
  journal = {arXiv preprint arXiv:1704.02813},
  year    = {2017},
}
% arXiv preprint (2015): larger-context language modelling.
@article{wang2015larger,
  author  = {Wang, Tian and Cho, Kyunghyun},
  title   = {Larger-context language modelling},
  journal = {arXiv preprint arXiv:1511.03729},
  year    = {2015},
}
% COLING paper (2012): factored language model based on a recurrent neural network.
@inproceedings{wu2012factored,
  author    = {Wu, Youzheng and Lu, Xugang and Yamamoto, Hitoshi and Matsuda, Shigeki and Hori, Chiori and Kashioka, Hideki},
  title     = {Factored language model based on recurrent neural network},
  booktitle = {Proceedings of COLING 2012},
  pages     = {2835--2850},
  year      = {2012},
}
% Conference paper (2000): can artificial neural networks learn language models?
@inproceedings{xu2000can,
  author    = {Xu, Wei and Rudnicky, Alex},
  title     = {Can artificial neural networks learn language models?},
  booktitle = {Sixth International Conference on Spoken Language Processing},
  year      = {2000},
}
% IEEE conference paper (2013): speed regularization and optimality in word classing.
@inproceedings{zweig2013speed,
  author       = {Zweig, Geoffrey and Makarychev, Konstantin},
  title        = {Speed regularization and optimality in word classing},
  booktitle    = {2013 IEEE International Conference on Acoustics, Speech and Signal Processing},
  pages        = {8237--8241},
  year         = {2013},
  organization = {IEEE},
}